Análisis de https://www.nature.com/articles/srep00196.pdf

Podemos usar read_lines_chunked si el archivo original es grande. En este ejemplo, filtramos las recetas Latin American:

library(tidyverse)
# Keep only the Latin American recipes from a chunk of raw CSV lines.
#
# Each line has the form "cuisine,ingredient1,ingredient2,...", usually padded
# with trailing commas, so empty fields must be dropped after splitting.
#
# @param lineas Character vector of raw CSV lines (header already skipped).
# @param ...    Ignored; absorbs the extra arguments (e.g. chunk position)
#               that read_lines_chunked passes to its callback.
# @return A list with one character vector of ingredients per recipe;
#         the cuisine label itself is removed.
limpiar <- function(lineas, ...) {
  # \(x) lambdas (R >= 4.1) replace the legacy purrr ~.x formulas.
  str_split(lineas, ",") |>
    keep(\(campos) campos[1] == "LatinAmerican") |>
    map(\(campos) campos[-1]) |>              # drop the cuisine label
    map(\(campos) campos[nchar(campos) > 0])  # drop empty (padding) fields
}
# Wrap the cleaning function in a readr callback: each chunk's cleaned
# result is collected as one element of a list.
callback_limpiar <- ListCallback$new(limpiar)
# Stream the file 1000 lines at a time (skip = 1 skips the header row),
# so the full raw file never has to fit in memory at once.
filtrado <- read_lines_chunked('../datos/recetas/srep00196-s3.csv',
                    skip = 1, callback = callback_limpiar, chunk_size = 1000)
# One sub-list per chunk comes back; flatten to a single list of recipes.
recetas <-  filtrado |> flatten()
# Inspect the first ten recipes.
recetas[1:10]
## [[1]]
## [1] "tomato"            "cilantro"          "lemon_juice"      
## [4] "onion"             "green_bell_pepper" "cayenne"          
## 
## [[2]]
##  [1] "olive_oil"     "pepper"        "lemon_juice"   "wheat"        
##  [5] "onion"         "vinegar"       "asparagus"     "parsley"      
##  [9] "white_wine"    "garlic"        "bell_pepper"   "oregano"      
## [13] "basil"         "vegetable_oil" "chicken"      
## 
## [[3]]
## [1] "tomato"     "pepper"     "potato"     "lime_juice" "cayenne"   
## [6] "cumin"      "scallion"   "cilantro"   "cream"     
## 
## [[4]]
##  [1] "tomato"        "beef"          "onion"         "cayenne"      
##  [5] "corn"          "black_pepper"  "cumin"         "buttermilk"   
##  [9] "bean"          "garlic"        "bell_pepper"   "oregano"      
## [13] "vegetable_oil" "egg"           "cream"        
## 
## [[5]]
## [1] "tomato"      "garlic"      "onion"       "beef"        "cayenne"    
## [6] "cumin"       "bell_pepper"
## 
## [[6]]
##  [1] "olive_oil"         "cilantro"          "wheat"            
##  [4] "onion"             "cayenne"           "cumin"            
##  [7] "lettuce"           "garlic"            "bell_pepper"      
## [10] "soybean"           "mozzarella_cheese" "lime"             
## [13] "turmeric"         
## 
## [[7]]
## [1] "butter"  "cheese"  "ham"     "onion"   "potato"  "cayenne" "garlic" 
## [8] "tomato" 
## 
## [[8]]
##  [1] "cane_molasses"  "olive_oil"      "pepper"         "red_wine"      
##  [5] "tabasco_pepper" "seed"           "cucumber"       "oregano"       
##  [9] "olive"          "tamarind"       "bread"          "tomato"        
## [13] "vinegar"        "lemon"          "onion"          "parsley"       
## [17] "tomato_juice"   "garlic"         "beef_broth"     "egg"           
## 
## [[9]]
## [1] "butter"  "tomato"  "garlic"  "onion"   "chicken" "rice"    "cayenne"
## 
## [[10]]
##  [1] "cane_molasses" "butter"        "lemon_juice"   "wheat"        
##  [5] "apple"         "plum"          "milk"          "pecan"        
##  [9] "cinnamon"      "orange"        "tequila"       "cream"
length(recetas)
## [1] 2917
library(arules)
length(recetas)
## [1] 2917
## Do not set the support threshold much lower than this: we have relatively
## few transactions, so smaller supports would rest on very few counts.
pars <- list(support = 0.05,  target = 'frequent itemsets',
             ext = TRUE)
# Mine frequent itemsets (not rules yet); ext = TRUE adds extended
# quality information such as absolute counts.
ap_recetas <- apriori(recetas, parameter = pars)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5    0.05      1
##  maxlen            target  ext
##      10 frequent itemsets TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 145 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[260 item(s), 2917 transaction(s)] done [0.00s].
## sorting and recoding items ... [37 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.00s].
## sorting transactions ... done [0.00s].
## writing ... [759 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
length(ap_recetas)
## [1] 759

Vemos los ítems frecuentes

# Single-ingredient itemsets, ordered by support (most frequent first).
ap_1 <- subset(ap_recetas, size(ap_recetas) == 1)
frecs <- ap_1 |> sort(by = "support") |> DATAFRAME()
# mutate_if() is superseded; across(where(...)) is the current dplyr idiom.
DT::datatable(frecs |> mutate(across(where(is.numeric), \(x) round(x, 3))))

Y ahora examinamos combinaciones frecuentes de distintos tamaños

# Frequent ingredient pairs; show those with support above 0.20,
# most common first.
ap_2 <- subset(ap_recetas, size(ap_recetas) == 2)
inspect(sort(subset(ap_2, support > 0.20), by = "support"))
##      items             support   count
## [1]  {cayenne, onion}  0.5128557 1496 
## [2]  {garlic, onion}   0.4991430 1456 
## [3]  {cayenne, garlic} 0.4844018 1413 
## [4]  {onion, tomato}   0.4586904 1338 
## [5]  {cayenne, tomato} 0.4381214 1278 
## [6]  {garlic, tomato}  0.4220089 1231 
## [7]  {cumin, garlic}   0.2451148  715 
## [8]  {cayenne, cumin}  0.2430579  709 
## [9]  {cumin, onion}    0.2358588  688 
## [10] {cayenne, corn}   0.2324306  678 
## [11] {corn, onion}     0.2159753  630 
## [12] {corn, tomato}    0.2015770  588

Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta: estos ingredientes son bases de salsas, combinaciones de condimentos:

# Frequent 4-ingredient combinations; show those with support above 0.10,
# most common first.
ap_4 <- subset(ap_recetas, size(ap_recetas) == 4)
inspect(sort(subset(ap_4, support > 0.10), by = "support"))
##      items                                    support   count
## [1]  {cayenne, garlic, onion, tomato}         0.3345903 976  
## [2]  {cayenne, cumin, garlic, onion}          0.1960919 572  
## [3]  {cumin, garlic, onion, tomato}           0.1642098 479  
## [4]  {cayenne, cumin, onion, tomato}          0.1631814 476  
## [5]  {cayenne, cumin, garlic, tomato}         0.1624957 474  
## [6]  {cayenne, corn, garlic, onion}           0.1580391 461  
## [7]  {corn, garlic, onion, tomato}            0.1467261 428  
## [8]  {cayenne, corn, onion, tomato}           0.1463833 427  
## [9]  {cayenne, corn, garlic, tomato}          0.1443264 421  
## [10] {cayenne, garlic, onion, oregano}        0.1343846 392  
## [11] {black_pepper, cayenne, garlic, onion}   0.1278711 373  
## [12] {cayenne, garlic, onion, vegetable_oil}  0.1227288 358  
## [13] {cayenne, cilantro, garlic, onion}       0.1182722 345  
## [14] {bell_pepper, cayenne, garlic, onion}    0.1162153 339  
## [15] {garlic, onion, tomato, vegetable_oil}   0.1145012 334  
## [16] {cayenne, onion, tomato, vegetable_oil}  0.1127871 329  
## [17] {cayenne, cumin, garlic, oregano}        0.1114158 325  
## [18] {cumin, garlic, onion, oregano}          0.1107302 323  
## [19] {cayenne, cilantro, onion, tomato}       0.1100446 321  
## [20] {cayenne, cumin, onion, oregano}         0.1083305 316  
## [21] {cayenne, garlic, tomato, vegetable_oil} 0.1062736 310  
## [22] {bell_pepper, garlic, onion, tomato}     0.1045595 305  
## [23] {cayenne, cilantro, garlic, tomato}      0.1042167 304  
## [24] {cayenne, cheddar_cheese, garlic, onion} 0.1038738 303  
## [25] {garlic, onion, oregano, tomato}         0.1038738 303  
## [26] {cayenne, cheese, garlic, onion}         0.1035310 302  
## [27] {black_pepper, garlic, onion, tomato}    0.1031882 301  
## [28] {bell_pepper, cayenne, onion, tomato}    0.1025026 299  
## [29] {cilantro, garlic, onion, tomato}        0.1021598 298  
## [30] {cayenne, cheddar_cheese, onion, tomato} 0.1018169 297  
## [31] {black_pepper, cumin, garlic, onion}     0.1018169 297  
## [32] {black_pepper, cayenne, onion, tomato}   0.1011313 295  
## [33] {cayenne, garlic, oregano, tomato}       0.1007885 294  
## [34] {cayenne, onion, oregano, tomato}        0.1004457 293

Extracción de reglas

# Switch the target to association rules and lower the support threshold;
# confidence = 0.10 keeps even weak rules so we can filter later by hyperlift.
pars <- list(support = 0.01, confidence = 0.10,
             target = 'rules',
             ext = TRUE)
reglas_recetas <- apriori(recetas, parameter = pars)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.1    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 29 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[260 item(s), 2917 transaction(s)] done [0.00s].
## sorting and recoding items ... [101 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(recetas, parameter = pars): Mining stopped (maxlen reached).
## Only patterns up to a length of 10 returned!
##  done [0.02s].
## writing ... [58318 rule(s)] done [0.01s].
## creating S4 object  ... done [0.01s].
# Append a hyper_lift column to the quality measures of a rule set.
#
# @param reglas An arules rules object.
# @param trans  The transactions the rules were mined from.
# @return The same rules object with an extra hyper_lift quality column.
agregar_hyperlift <- function(reglas, trans) {
  hl <- interestMeasure(reglas, measure = "hyperLift",
                        transactions = trans)
  quality(reglas) <- cbind(quality(reglas), hyper_lift = hl)
  reglas
}
reglas_recetas <- agregar_hyperlift(reglas_recetas, recetas)

Análisis de pares comunes

library(arulesViz)
# Strong and reasonably frequent rules: hyperlift above 1.1, support above
# 0.1 and confidence above 0.40.
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.1 & support > 0.1 & confidence > 0.40)
length(reglas_1)
## [1] 341
# Keep one-to-one rules (two items in total: one on each side).
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
#inspect(reglas_tam_2 |> sort(by = 'hyper_lift')) 
# Interactive scatter plot of the stronger rules (support > 0.2).
plot(reglas_1 |> subset(support > 0.2), engine = "plotly")
library(tidygraph)
## 
## Attaching package: 'tidygraph'
## The following object is masked from 'package:stats':
## 
##     filter
library(ggraph)
# Build an edge list (from = LHS, to = RHS) from the one-to-one rules.
df_reglas <- reglas_tam_2 |>
  DATAFRAME() |>
  rename(from = LHS, to = RHS) |>
  data.frame()
# Edge weight: log of lift (picked up by the layout algorithm).
df_reglas$weight <- log(df_reglas$lift)
# Node centrality (number of incident edges) drives node size and colour.
graph_1 <- as_tbl_graph(df_reglas) |>
  mutate(centrality = centrality_degree(mode = "all"))
# Fix the seed so the force-directed layout is reproducible.
set.seed(881)
ggraph(graph_1, layout = "fr") +
  geom_edge_link(aes(alpha = lift),
                 colour = "red",
                 arrow = arrow(length = unit(4, "mm"))) +
  geom_node_point(aes(size = centrality, colour = centrality)) +
  geom_node_text(aes(label = name),
                 size = 4, colour = "gray20", repel = TRUE) +
  theme_graph(base_family = "sans")

Análisis de reglas con hyperlift alto

# Rules with high hyperlift regardless of support: rarer but strong associations.
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.5 & confidence > 0.1)
length(reglas_1)
## [1] 16244
# Keep one-to-one rules (two items in total).
reglas_tam_2 <- subset(reglas_1, size(reglas_1) == 2)
length(reglas_tam_2)
## [1] 135
library(tidygraph)
library(ggraph)
# Edge list for the graph. as_tibble() replaces as_data_frame(), which has
# been deprecated since tibble 2.0.0 (the deprecation warning below was
# produced by the old call).
df_reglas <- reglas_tam_2 |> DATAFRAME() |> 
  rename(from = LHS, to = RHS) |> as_tibble()
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Edge weight: log of hyper_lift (picked up by the layout algorithm).
df_reglas <- df_reglas |> mutate(weight = log(hyper_lift))
# Node centrality (number of incident edges) drives node size and colour.
graph_1 <- as_tbl_graph(df_reglas) |>
  mutate(centrality = centrality_degree(mode = "all"))
# start.temp = 100 gives the force-directed layout more room to spread nodes.
ggraph(graph_1, layout = "fr", start.temp = 100) +
  geom_edge_link(aes(alpha = lift),
                 colour = "red",
                 arrow = arrow(length = unit(4, "mm"))) +
  geom_node_point(aes(size = centrality, colour = centrality)) +
  geom_node_text(aes(label = name),
                 size = 4, colour = "gray20", repel = TRUE) +
  theme_graph(base_family = "sans")
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Exportamos para examinar en Gephi:

#write_csv(df_reglas |> rename(source=from, target=to) |>
#            select(-count), 'reglas.csv')

Nota

La combinación corn y starch puede deberse en parte a una separación incorrecta en el procesamiento de los datos (corn starch o maizena convertido en dos ingredientes, corn y starch):

#df_reglas |> filter(from == "{corn}", to == "{starch}")

La confianza es considerablemente alta, aunque tenemos pocos datos de esta combinación. Podemos examinar algunos ejemplos:

#recetas |> keep(~ "starch" %in% .x & "corn" %in% .x) |> head(10)